{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib\n",
    "import numpy as np\n",
    "from mpmath import mp\n",
    "import matplotlib.pyplot as plt\n",
    "from mpl_toolkits.axes_grid1 import host_subplot\n",
    "import mpl_toolkits.axisartist as AA\n",
    "import math\n",
    "import warnings\n",
    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
    "plt.rcParams['figure.dpi'] = 180\n",
    "plt.rcParams['figure.figsize'] = [12.0, 8.0]\n",
    "plt.rcParams['text.latex.unicode'] = True\n",
    "plt.rcParams['text.usetex'] = True\n",
    "plt.rcParams['mathtext.fontset'] = 'stix'\n",
    "plt.rcParams['font.family'] = 'STIXGeneral'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "log_sizes = np.array([i for i in range(1,35)])\n",
    "sizes = np.array([2**i for i in range(1,35)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register file size is speculative\n",
    "# Defaults are for AWS EC2 c5.2xlarge, an \"Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz\"\n",
    "def init_plot(reg_file=128 * 8, l1=32*1024, l2=1024**2, l3=24 * 1024**2, ram=16 * 1024**3, disk=256 * 1024**3):\n",
    "    plt.xlabel('Size $\\\\left[\\\\log_2 n\\\\right]$')\n",
    "    plt.xscale('log')\n",
    "    plt.xticks(sizes, [str(n) for n in log_sizes])\n",
    "    plt.gca().xaxis.set_minor_locator(plt.NullLocator())\n",
    "    \n",
    "    def mem_line(size, label):\n",
    "        plt.axvline(size / 32, color='grey', linestyle='--')\n",
    "        plt.text(size / 32, 100, label)\n",
    "    \n",
    "    mem_line(reg_file, \"REG\")\n",
    "    mem_line(l1, \"L1\")\n",
    "    mem_line(l2, \"L2\")\n",
    "    mem_line(l3, \"L3\")\n",
    "    mem_line(ram, \"RAM\")\n",
    "    mem_line(disk, \"DISK\")\n",
    "\n",
    "    plt.yscale('log')\n",
    "    plt.ylabel('Time $\\\\left[\\\\mathtt{sec}\\\\right]$')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def series(data, color='tab:blue'):\n",
    "    plt.plot(sizes[:len(data)], data, color=color, marker='.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Benchmark c5.2xlarge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": true,
    "toc-nb-collapsed": true
   },
   "source": [
    "On AWS EC2 instance type `c5.2xlarge`. Root drive size increased to 256GB and a 64GB swapfile is added. Using `--allocation heap`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft = np.fromstring(\"\"\"\n",
    "1\t0.00000531\n",
    "2\t0.000004477\n",
    "3\t0.000007344\n",
    "4\t0.000009439\n",
    "5\t0.000013781\n",
    "6\t0.000023936\n",
    "7\t0.000042144\n",
    "8\t0.000077292\n",
    "9\t0.000149075\n",
    "10\t0.000309063\n",
    "11\t0.000690227\n",
    "12\t0.001507601\n",
    "13\t0.00330798\n",
    "14\t0.00717378\n",
    "15\t0.002854655\n",
    "16\t0.005344267\n",
    "17\t0.010803624\n",
    "18\t0.022976793\n",
    "19\t0.046437339\n",
    "20\t0.097819326\n",
    "21\t0.203701168\n",
    "22\t0.424807287\n",
    "23\t0.907349772\n",
    "24\t1.858657419\n",
    "25\t4.064888369\n",
    "26\t8.003252785\n",
    "27\t17.67193235\n",
    "28\t34.221835982\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_sqrt = np.fromstring(\"\"\"\n",
    "1\t0.000008935\n",
    "2\t0.000038013\n",
    "3\t0.000023374\n",
    "4\t0.000026747\n",
    "5\t0.000029887\n",
    "6\t0.00004039\n",
    "7\t0.000048373\n",
    "8\t0.000072872\n",
    "9\t0.000123616\n",
    "10\t0.000250539\n",
    "11\t0.000507387\n",
    "12\t0.001089775\n",
    "13\t0.002354191\n",
    "14\t0.005096965\n",
    "15\t0.010595209\n",
    "16\t0.022575648\n",
    "17\t0.0473638\n",
    "18\t0.100595503\n",
    "19\t0.211823046\n",
    "20\t0.446282983\n",
    "21\t0.938822169\n",
    "22\t1.9707616190000001\n",
    "23\t4.142269804\n",
    "24\t8.607713068\n",
    "25\t18.171836223\n",
    "26\t37.403999458\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_rec = np.fromstring(\"\"\"\n",
    "1\t0.000006159\n",
    "2\t0.000004487\n",
    "3\t0.000006596\n",
    "4\t0.000008758\n",
    "5\t0.000011726\n",
    "6\t0.000026389\n",
    "7\t0.000039159\n",
    "8\t0.000073156\n",
    "9\t0.000148793\n",
    "10\t0.00030885\n",
    "11\t0.000692644\n",
    "12\t0.001506197\n",
    "13\t0.003307426\n",
    "14\t0.00717706\n",
    "15\t0.015458752\n",
    "16\t0.033280211\n",
    "17\t0.071393263\n",
    "18\t0.152663299\n",
    "19\t0.325223417\n",
    "20\t0.691438377\n",
    "21\t1.455103663\n",
    "22\t3.058479201\n",
    "23\t6.4278282\n",
    "24\t13.515026047\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "init_plot()\n",
    "series(fft, 'tab:red')\n",
    "series(fft_sqrt, 'tab:orange')\n",
    "series(fft_rec, 'tab:blue')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": true,
    "toc-nb-collapsed": true
   },
   "source": [
    "## Benchmark c5.2xlarge (64MiB RAM)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": true,
    "toc-nb-collapsed": true
   },
   "source": [
    "On AWS EC2 instance type `c5.2xlarge`. Root drive size increased to 256GiB and a 64GiB swapfile is added.\n",
    "\n",
    "```\n",
    "L1 Instruction-Cache: (32 KiB, 8-way associativity, direct-mapped)\n",
    "L1 Data-Cache: (32 KiB, 8-way associativity, direct-mapped)\n",
    "L2 Unified-Cache: (1024 KiB, 16-way associativity, direct-mapped)\n",
    "L3 Unified-Cache: (24 MiB, 11-way associativity, hash-based-mapping)\n",
    "```\n",
    "\n",
    "Memory is restricted to 64MiB RAM using cgroups:\n",
    "\n",
    "```\n",
    "sudo cgcreate -t $USER:$USER -a $USER:$USER -g memory:limited\n",
    "echo 67108864 > /sys/fs/cgroup/memory/limited/memory.limit_in_bytes\n",
    "cgexec -g memory:limited ./fft\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_heap = np.fromstring(\"\"\"\n",
    "1\t0.00034372\n",
    "2\t0.000039486\n",
    "3\t0.000035912\n",
    "4\t0.000037605\n",
    "5\t0.000041504\n",
    "6\t0.000042154\n",
    "7\t0.000055761\n",
    "8\t0.000055802\n",
    "9\t0.000086509\n",
    "10\t0.000104544\n",
    "11\t0.000249147\n",
    "12\t0.000277132\n",
    "13\t0.000917944\n",
    "14\t0.001096818\n",
    "15\t0.00341159\n",
    "16\t0.004738028\n",
    "17\t0.015229078\n",
    "18\t0.020695995\n",
    "19\t0.070773651\n",
    "20\t0.091959886\n",
    "21\t30.928260041\n",
    "22\t40.091185241\n",
    "23\t163.181767742\n",
    "24\t184.1793349\n",
    "25\t832.334289622\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_mmap = np.fromstring(\"\"\"\n",
    "1\t0.000327282\n",
    "2\t0.000035651\n",
    "3\t0.000028616\n",
    "4\t0.000040656\n",
    "5\t0.000043662\n",
    "6\t0.000036332\n",
    "7\t0.000054138\n",
    "8\t0.000056564\n",
    "9\t0.000078225\n",
    "10\t0.000108604\n",
    "11\t0.000244958\n",
    "12\t0.000287139\n",
    "13\t0.000859705\n",
    "14\t0.001116579\n",
    "15\t0.003552164\n",
    "16\t0.004715079\n",
    "17\t0.015413409\n",
    "18\t0.021136953\n",
    "19\t0.073038566\n",
    "20\t0.092504724\n",
    "21\t76.226779587\n",
    "22\t68.887253346\n",
    "23\t726.08383654\n",
    "24\t527.209818911\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "transpose_heap = np.fromstring(\"\"\"\n",
    "1\t0.000000407\n",
    "2\t0.000000201\n",
    "3\t0.000000508\n",
    "4\t0.000000106\n",
    "5\t0.000000378\n",
    "6\t0.000000184\n",
    "7\t0.000004731\n",
    "8\t0.000000604\n",
    "9\t0.000007464\n",
    "10\t0.000001546\n",
    "11\t0.000031108\n",
    "12\t0.000005326\n",
    "13\t0.00012615\n",
    "14\t0.000028176\n",
    "15\t0.000708114\n",
    "16\t0.000138124\n",
    "17\t0.003178672\n",
    "18\t0.000945967\n",
    "19\t0.015287759\n",
    "20\t0.005595849\n",
    "21\t10.570897667\n",
    "22\t11.281882518\n",
    "23\t48.99322635\n",
    "24\t44.853152764\n",
    "25\t685.421398633\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "transpose_mmap = np.fromstring(\"\"\"\n",
    "1\t0.000001035\n",
    "2\t0.000000202\n",
    "3\t0.000000511\n",
    "4\t0.000000152\n",
    "5\t0.000000656\n",
    "6\t0.000000288\n",
    "7\t0.000007064\n",
    "8\t0.000000804\n",
    "9\t0.000011185\n",
    "10\t0.000002899\n",
    "11\t0.000031892\n",
    "12\t0.00000538\n",
    "13\t0.000134255\n",
    "14\t0.000028246\n",
    "15\t0.000710412\n",
    "16\t0.000142058\n",
    "17\t0.003230439\n",
    "18\t0.001012398\n",
    "19\t0.015829714\n",
    "20\t0.005500206\n",
    "21\t10.533343567\n",
    "22\t11.358831156\n",
    "23\t48.803206975\n",
    "24\t44.780711319\n",
    "25\t186.202560067\n",
    "26\t145.289483772\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "init_plot(ram=64*1024**2)\n",
    "series(fft_heap, 'tab:red')\n",
    "series(fft_mmap, 'tab:blue')\n",
    "series(transpose_heap, 'tab:orange')\n",
    "series(transpose_mmap, 'tab:cyan')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Benchmark single thread"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "c5.2xlarge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft = np.fromstring(\"\"\"\n",
    "1\t0.000010837\n",
    "2\t0.00003163\n",
    "3\t0.000021725\n",
    "4\t0.000027168\n",
    "5\t0.000025975\n",
    "6\t0.000034325\n",
    "7\t0.000043394\n",
    "8\t0.000069395\n",
    "9\t0.00013186\n",
    "10\t0.000252507\n",
    "11\t0.000502172\n",
    "12\t0.001078727\n",
    "13\t0.00234277\n",
    "14\t0.00504515\n",
    "15\t0.010528185\n",
    "16\t0.022451856\n",
    "17\t0.046697365\n",
    "18\t0.099328349\n",
    "19\t0.207932491\n",
    "20\t0.444358145\n",
    "21\t0.93529772\n",
    "22\t1.964368904\n",
    "23\t4.132257624\n",
    "24\t8.600371875\n",
    "25\t18.178107202\n",
    "26\t37.357505465\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_iterative = np.fromstring(\"\"\"\n",
    "1\t0.000005481\n",
    "2\t0.000004502\n",
    "3\t0.00000692\n",
    "4\t0.000008215\n",
    "5\t0.000011034\n",
    "6\t0.000020269\n",
    "7\t0.000032115\n",
    "8\t0.000056194\n",
    "9\t0.000098589\n",
    "10\t0.000203465\n",
    "11\t0.000447601\n",
    "12\t0.000986478\n",
    "13\t0.002059617\n",
    "14\t0.004453699\n",
    "15\t0.009803471\n",
    "16\t0.022309177\n",
    "17\t0.047588306\n",
    "18\t0.106476312\n",
    "19\t0.26432274\n",
    "20\t0.60108076\n",
    "21\t1.2733614389999999\n",
    "22\t2.688171277\n",
    "23\t5.689161517\n",
    "24\t12.132268398\n",
    "25\t25.850465222\n",
    "26\t55.871878074\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_depth_first = np.fromstring(\"\"\"\n",
    "1\t0.000005586\n",
    "2\t0.000005259\n",
    "3\t0.000009514\n",
    "4\t0.000010033\n",
    "5\t0.000013038\n",
    "6\t0.000027076\n",
    "7\t0.000037728\n",
    "8\t0.000062199\n",
    "9\t0.000136259\n",
    "10\t0.000282032\n",
    "11\t0.000628744\n",
    "12\t0.001382235\n",
    "13\t0.003047723\n",
    "14\t0.006714292\n",
    "15\t0.014992353\n",
    "16\t0.033965664\n",
    "17\t0.073531157\n",
    "18\t0.170479579\n",
    "19\t0.409346494\n",
    "20\t1.049937156\n",
    "21\t2.444037146\n",
    "22\t5.505871476\n",
    "23\t12.131428218\n",
    "24\t26.768131456\n",
    "25\t61.079266758\n",
    "26\t133.57830445\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_recursive = np.fromstring(\"\"\"\n",
    "1\t0.000005541\n",
    "2\t0.000004514\n",
    "3\t0.000004572\n",
    "4\t0.000005506\n",
    "5\t0.000008072\n",
    "6\t0.000014245\n",
    "7\t0.000031183\n",
    "8\t0.00006486\n",
    "9\t0.000143123\n",
    "10\t0.000324709\n",
    "11\t0.000689698\n",
    "12\t0.00150638\n",
    "13\t0.003293801\n",
    "14\t0.007106039\n",
    "15\t0.01528987\n",
    "16\t0.032758035\n",
    "17\t0.069945614\n",
    "18\t0.148304918\n",
    "19\t0.315367176\n",
    "20\t0.670398505\n",
    "21\t1.417594193\n",
    "22\t2.980605325\n",
    "23\t6.24611864\n",
    "24\t13.096406455\n",
    "25\t27.498664066\n",
    "26\t57.386700655\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "init_plot()\n",
    "series(fft, 'black')\n",
    "series(fft_iterative, 'tab:blue')\n",
    "series(fft_depth_first, 'tab:orange')\n",
    "series(fft_recursive, 'tab:pink')\n",
    "series(fft2_heap, 'tab:cyan')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": true,
    "toc-nb-collapsed": true
   },
   "source": [
    "## Memory access pattern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fft_df(values, size, offset, stride, loop):\n",
    "    if size == 1:\n",
    "        values += [offset]\n",
    "    else:\n",
    "        if stride == loop and loop < 128:\n",
    "            fft_df(values, size // 2, offset, 2 * stride, 2 * loop)\n",
    "        else:\n",
    "            fft_df(values, size // 2, offset, 2 * stride, loop)\n",
    "            fft_df(values, size // 2, offset + stride, 2 * stride, loop)\n",
    "        for i in range(size // 2):\n",
    "            for j in range(loop):\n",
    "                values += [offset + 2 * i * stride + j]\n",
    "                values += [offset + 2 * i * stride + j + stride]\n",
    "    return values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = fft_df([], 16384, 0, 1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(a, linestyle='', marker='.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "2**15 / 64 / 8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Threads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fft_threads = np.fromstring(\"\"\"\n",
    "24\t8.602388099\n",
    "24\t4.433958019\n",
    "24\t3.055124852\n",
    "24\t2.35013958\n",
    "24\t2.18961738\n",
    "24\t2.051058217\n",
    "24\t1.933472554\n",
    "24\t1.826348783\n",
    "24\t1.8278633229999999\n",
    "24\t1.837789269\n",
    "\"\"\", sep=' ').reshape((-1, 2))[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "threads = np.array(range(1,20))\n",
    "plt.xticks(threads, [str(n) for n in threads])\n",
    "plt.axvline(4, color='grey', linestyle='--')\n",
    "plt.text(4, 1, 'Cores')\n",
    "plt.axvline(8, color='grey', linestyle='--')\n",
    "plt.text(8, 1, 'Hyper threads')\n",
    "plt.plot(threads[:len(fft_threads)], fft_threads * threads[:len(fft_threads)] / fft_threads[0], marker = '.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "threads[:len(fft_threads)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "2.99\n",
    "\n",
    "\n",
    "$69.7 billion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "104 / 2.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
